“Ask a home buyer to describe their dream house, and they probably won’t begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition’s dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.
With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this problem challenges you to predict the final price of each home.
The potential for creative feature engineering provides a rich opportunity for fun and learning. This dataset lends itself to advanced regression techniques like random forests and gradient boosting with the popular XGBoost library."
## load training data
house_train <- read.csv("./train.csv",
header = TRUE,
na.strings = "",
stringsAsFactors = FALSE)
## load test data
house_test <- read.csv("./test.csv",
header = TRUE,
na.strings = "",
stringsAsFactors = FALSE)
dim(house_train)
## [1] 1460 81
dim(house_test)
## [1] 1459 80
str(house_train)
## 'data.frame': 1460 obs. of 81 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr "RL" "RL" "RL" "RL" ...
## $ LotFrontage : chr "65" "80" "68" "60" ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr "NA" "NA" "NA" "NA" ...
## $ LotShape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : chr "196" "0" "162" "0" ...
## $ ExterQual : chr "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr "NA" "TA" "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : chr "2003" "1976" "2001" "1998" ...
## $ GarageFinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr "NA" "NA" "NA" "NA" ...
## $ Fence : chr "NA" "NA" "NA" "NA" ...
## $ MiscFeature : chr "NA" "NA" "NA" "NA" ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
## get levels of categorical features from data description
factorLevel <- list()
conn <- file("./data_description.txt", open="r")
f <-readLines(conn)
for (line in f){
if(!grepl("^[[:blank:]]", line) & grepl(": ", line)) {
col_name <<- trimws(gsub(":.*", "", line))
} else {
level <- trimws(gsub("\t.*", "", line))
if (level != "") {
factorLevel[[col_name]] <- c(factorLevel[[col_name]], level)
}
}
}
close(conn)
print(factorLevel[1:6])
## $MSSubClass
## [1] "20" "30" "40" "45" "50" "60" "70" "75" "80" "85" "90"
## [12] "120" "150" "160" "180" "190"
##
## $MSZoning
## [1] "A" "C" "FV" "I" "RH" "RL" "RP" "RM"
##
## $Street
## [1] "Grvl" "Pave"
##
## $Alley
## [1] "Grvl" "Pave" "NA"
##
## $LotShape
## [1] "Reg" "IR1" "IR2" "IR3"
##
## $LandContour
## [1] "Lvl" "Bnk" "HLS" "Low"
## check if levels in description cover unique data values
for (varname in names(factorLevel)) {
levelDiff <- setdiff(unique(house_train[[varname]]),
factorLevel[[varname]])
if(length(levelDiff)) {
print(paste(varname,
paste(levelDiff, collapse = ", "),
sep = ": "))
}
}
## [1] "MSZoning: C (all)"
## [1] "Neighborhood: NAmes"
## [1] "BldgType: 2fmCon, Duplex, Twnhs"
## [1] "Exterior2nd: Wd Shng, CmentBd, Brk Cmn"
## [1] "MasVnrType: NA"
## [1] "Electrical: NA"
## fix those levels that don't match with data
## ignore "NA" as they will be considered as missing when converting categorical to factors
unique(house_train$MSZoning)
## [1] "RL" "RM" "C (all)" "FV" "RH"
factorLevel$MSZoning
## [1] "A" "C" "FV" "I" "RH" "RL" "RP" "RM"
factorLevel$MSZoning[2] <- "C (all)"
unique(house_train$Neighborhood)
## [1] "CollgCr" "Veenker" "Crawfor" "NoRidge" "Mitchel" "Somerst" "NWAmes"
## [8] "OldTown" "BrkSide" "Sawyer" "NridgHt" "NAmes" "SawyerW" "IDOTRR"
## [15] "MeadowV" "Edwards" "Timber" "Gilbert" "StoneBr" "ClearCr" "NPkVill"
## [22] "Blmngtn" "BrDale" "SWISU" "Blueste"
factorLevel$Neighborhood
## [1] "Blmngtn" "Blueste" "BrDale" "BrkSide" "ClearCr" "CollgCr" "Crawfor"
## [8] "Edwards" "Gilbert" "IDOTRR" "MeadowV" "Mitchel" "Names" "NoRidge"
## [15] "NPkVill" "NridgHt" "NWAmes" "OldTown" "SWISU" "Sawyer" "SawyerW"
## [22] "Somerst" "StoneBr" "Timber" "Veenker"
factorLevel$Neighborhood[13] <- "NAmes"
unique(house_train$BldgType)
## [1] "1Fam" "2fmCon" "Duplex" "TwnhsE" "Twnhs"
factorLevel$BldgType
## [1] "1Fam" "2FmCon" "Duplx" "TwnhsE" "TwnhsI"
factorLevel$BldgType[c(2,3,5)] <- c("2fmCon","Duplex","Twnhs")
unique(house_train$Exterior2nd)
## [1] "VinylSd" "MetalSd" "Wd Shng" "HdBoard" "Plywood" "Wd Sdng" "CmentBd"
## [8] "BrkFace" "Stucco" "AsbShng" "Brk Cmn" "ImStucc" "AsphShn" "Stone"
## [15] "Other" "CBlock"
factorLevel$Exterior2nd
## [1] "AsbShng" "AsphShn" "BrkComm" "BrkFace" "CBlock" "CemntBd" "HdBoard"
## [8] "ImStucc" "MetalSd" "Other" "Plywood" "PreCast" "Stone" "Stucco"
## [15] "VinylSd" "Wd Sdng" "WdShing"
factorLevel$Exterior2nd[c(17,6,3)] <- c("Wd Shng","CmentBd","Brk Cmn")
## Get levels that only appear in the dataset
for (varname in names(factorLevel)) {
factorLevel[[varname]] <- intersect(factorLevel[[varname]],
unique(house_train[[varname]]))
}
## Re-run the previous cell to double check
## convert column datatype to numeric / factor
## On training dataset
for (varname in names(house_train)[-1]) {
if (varname %in% names(factorLevel)) {
house_train[[varname]] <- factor(house_train[[varname]],
levels = factorLevel[[varname]])
} else {
house_train[[varname]] <- as.numeric(house_train[[varname]])
}
}
## On testing dataset
for (varname in names(house_test)[-1]) {
if (varname %in% names(factorLevel)) {
house_test[[varname]] <- factor(house_test[[varname]],
levels = factorLevel[[varname]])
} else {
house_test[[varname]] <- as.numeric(house_test[[varname]])
}
}
house_train$Id <- NULL
rownames(house_test) <- house_test$Id
house_test$Id <- NULL
save(house_train, house_test, file = "./house_loaded.RData")
library(ggplot2)
library(gridExtra)
library(tabplot)
library(lsr)
library(corrplot)
library(dplyr)
rm(list = ls())
load("./house_loaded.RData")
## histogram on SalePrice
grid.arrange(ggplot(house_train) +
geom_histogram(aes(SalePrice), bins = 20),
ggplot(house_train) +
geom_histogram(aes(log(SalePrice + 1)), bins = 20),
ncol = 2)
## table plot all features on sortded SalePrice
colMtx <- matrix(names(house_train)[1:length(house_train)-1], nrow = 8)
for (i in 1:ncol(colMtx)) {
tableplot(house_train,
select_string = c(colMtx[,i], "SalePrice"),
sortCol = "SalePrice", decreasing = TRUE,
nBins = 30)
}
numeric_features <- names(house_train)[sapply(house_train, is.numeric)]
numeric_features <- numeric_features[-length(numeric_features)]
print(numeric_features)
## [1] "LotFrontage" "LotArea" "YearBuilt" "YearRemodAdd"
## [5] "MasVnrArea" "BsmtFinSF1" "BsmtFinSF2" "BsmtUnfSF"
## [9] "TotalBsmtSF" "X1stFlrSF" "X2ndFlrSF" "LowQualFinSF"
## [13] "GrLivArea" "BsmtFullBath" "BsmtHalfBath" "FullBath"
## [17] "HalfBath" "BedroomAbvGr" "KitchenAbvGr" "TotRmsAbvGrd"
## [21] "Fireplaces" "GarageYrBlt" "GarageCars" "GarageArea"
## [25] "WoodDeckSF" "OpenPorchSF" "EnclosedPorch" "X3SsnPorch"
## [29] "ScreenPorch" "PoolArea" "MiscVal" "MoSold"
## [33] "YrSold"
## correlation between continuous variables in training dataset - pearson
corr_numtran <- cor(house_train %>%
select(one_of(numeric_features, "SalePrice")),
method = "pearson",
use = "pairwise.complete.obs")
corrplot(corr_numtran, method = "color", order="hclust")
## correlation between continuous variables in test dataset - pearson
# corr_numtest <- cor(house_test %>%
# select(one_of(numeric_features)),
# method = "pearson",
# use = "pairwise.complete.obs")
#
# corrplot(corr_numtest, method = "color", order="hclust")
## ordinal features are those who contain one of the follow levels
ordinal_levels <- c("Reg", "5", "TA", "No", "Unf",
"MnPrv", "Y", "Mod", "HLS", "1Fam")
ordinal_features <- character(0)
for(feature in names(house_train)) {
if(is.factor(house_train[,feature]) &&
length(intersect(ordinal_levels, levels(house_train[,feature])))) {
ordinal_features <- c(ordinal_features, feature)
}
}
print(ordinal_features)
## [1] "LotShape" "LandContour" "LandSlope" "BldgType"
## [5] "OverallQual" "OverallCond" "ExterQual" "ExterCond"
## [9] "BsmtQual" "BsmtCond" "BsmtExposure" "BsmtFinType1"
## [13] "BsmtFinType2" "HeatingQC" "CentralAir" "KitchenQual"
## [17] "Functional" "FireplaceQu" "GarageFinish" "GarageQual"
## [21] "GarageCond" "PavedDrive" "Fence"
## correlation between ordinal variables in training dataset - kendall
corr_ordtran <- cor(data.matrix(house_train %>%
select(one_of(ordinal_features))),
method = "kendall",
use = "pairwise.complete.obs")
corrplot(corr_ordtran, method = "color", order="hclust")
## correlation between ordinal variables in test dataset - kendall
# corr_ordtest <- cor(data.matrix(house_test %>%
# select(one_of(ordinal_features))),
# method = "kendall",
# use = "pairwise.complete.obs")
#
# corrplot(corr_ordtest, method = "color", order="hclust")
## Cramér's V is a measure of association between two nominal variables, giving a value between 0 and +1 (inclusive)
cor.cramersV <- function(data) {
cramersV(table(data[complete.cases(data),]))
}
nominal_features <- setdiff(names(house_train),
c(numeric_features, ordinal_features, "SalePrice"))
## cramers V in test dataset
corr_nomtran <- sapply(nominal_features,
function(x) sapply(nominal_features,
function(y) cor.cramersV(house_train[, c(x, y)])))
corrplot(corr_nomtran, method = "color", order="hclust")
## coorelation between ordered categorical variables in training - spearman
cor.ordcnt <- function(data, x, y) {
cor(as.numeric(data[[x]]), as.numeric(data[[y]]),
method = "spearman",
use = "pairwise.complete.obs")
}
corr_ordcnttran <- data.frame(Variable = ordinal_features,
Correlation = sapply(ordinal_features,
function(x) -cor.ordcnt(house_train, x, "SalePrice")))
ggplot(corr_ordcnttran, aes(reorder(Variable, Correlation), Correlation)) +
geom_bar(stat = "identity") +
coord_flip()
## Might be a good idea to convert some ordinal predictors to continuous
grid.arrange(
ggplot(house_train, aes(x = OverallQual, y = SalePrice)) + geom_boxplot(),
ggplot(house_train, aes(x = ExterQual, y = SalePrice)) + geom_boxplot(),
ggplot(house_train, aes(x = BsmtQual, y = SalePrice)) + geom_boxplot(),
ggplot(house_train, aes(x = KitchenQual, y = SalePrice)) + geom_boxplot(),
ggplot(house_train, aes(x = GarageFinish, y = SalePrice)) + geom_boxplot(),
ggplot(house_train, aes(x = FireplaceQu, y = SalePrice)) + geom_boxplot(),
ncol = 2
)
grid.arrange(
ggplot(house_train, aes(x = as.integer(OverallQual), y = SalePrice)) + geom_point(),
ggplot(house_train, aes(x = as.integer(ExterQual), y = SalePrice)) + geom_point(),
ggplot(house_train, aes(x = as.integer(BsmtQual), y = SalePrice)) + geom_point(),
ggplot(house_train, aes(x = as.integer(KitchenQual), y = SalePrice)) + geom_point(),
ggplot(house_train, aes(x = as.integer(GarageFinish), y = SalePrice)) + geom_point(),
ggplot(house_train, aes(x = as.integer(FireplaceQu), y = SalePrice)) + geom_point(),
ncol = 2
)
tableplot(house_train %>% select(one_of("SalePrice","OverallQual", "ExterQual", "BsmtQual",
"KitchenQual", "GarageFinish", "FireplaceQu")),
decreasing = TRUE,
nBins = 18,
colorNA = "#FF1414", colorNA_num = "#FF1414")
library(VIM)
library(caret)
library(RANN)
## check missing values
col_missing <- names(house_train)[colSums(is.na(house_train)) > 0]
aggr(house_train[,col_missing], prop = F, numbers = T)
Filter(function(x) x > 0, colSums(is.na(house_train)))
## LotFrontage MasVnrType MasVnrArea Electrical GarageYrBlt
## 259 8 8 1 81
col_missing <- names(house_test)[colSums(is.na(house_test)) > 0]
aggr(house_test[,col_missing], prop = F, numbers = T)
Filter(function(x) x > 0, colSums(is.na(house_test)))
## MSSubClass MSZoning LotFrontage Utilities Exterior1st
## 1 4 227 2 1
## Exterior2nd MasVnrType MasVnrArea BsmtFinSF1 BsmtFinSF2
## 1 16 15 1 1
## BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath KitchenQual
## 1 1 2 2 1
## Functional GarageYrBlt GarageCars GarageArea SaleType
## 2 78 1 1 1
## table plot all features on sortded SalePrice
tableplot(house_train %>% select(starts_with("Garage")),
decreasing = TRUE,
nBins = 18,
colorNA = "#FF1414", colorNA_num = "#FF1414")
tableplot(house_test %>% select(starts_with("Garage")),
decreasing = TRUE,
nBins = 19,
colorNA = "#FF1414", colorNA_num = "#FF1414")
grid.arrange(
ggplot(house_train,
aes(YearBuilt, ifelse(is.na(GarageYrBlt), YearBuilt, GarageYrBlt))) +
geom_point(aes(color = is.na(GarageYrBlt))),
ggplot(house_test,
aes(YearBuilt, ifelse(is.na(GarageYrBlt), YearBuilt, GarageYrBlt))) +
geom_point(aes(color = is.na(GarageYrBlt))),
ncol = 2
)
## Fix outlier
house_test$GarageYrBlt[which(house_test$GarageYrBlt == 2207)] <- 2007
## Create new feature: hasGarage
## Impute GarageYrBlt with YearBuilt
house_train <- house_train %>%
mutate(hasGarage = ifelse(is.na(GarageYrBlt), 0, 1),
GarageBlt = ifelse(is.na(GarageYrBlt), 0, GarageYrBlt - YearBuilt)) %>%
select(-GarageYrBlt)
house_test <- house_test %>%
mutate(hasGarage = ifelse(is.na(GarageYrBlt), 0, 1),
GarageBlt = ifelse(is.na(GarageYrBlt), 0, GarageYrBlt - YearBuilt)) %>%
select(-GarageYrBlt)
grid.arrange(
ggplot(house_train, aes(YearBuilt, GarageBlt)) +
geom_point(aes(color = as.factor(hasGarage))),
ggplot(house_test, aes(YearBuilt, GarageBlt)) +
geom_point(aes(color = as.factor(hasGarage))),
ncol = 2
)
grid.arrange(
ggplot(house_train, aes(YearBuilt, YearRemodAdd)) +
geom_point(aes(color = (YearRemodAdd == YearBuilt))),
ggplot(house_test, aes(YearBuilt, YearRemodAdd)) +
geom_point(aes(color = (YearRemodAdd == YearBuilt))),
ncol = 2
)
house_train <- house_train %>%
mutate(isRemod = ifelse(YearRemodAdd == YearBuilt, 0, 1),
RemodAdd = YearRemodAdd - YearBuilt) %>%
select(-YearRemodAdd)
house_test <- house_test %>%
mutate(isRemod = ifelse(YearRemodAdd == YearBuilt, 0, 1),
RemodAdd = YearRemodAdd - YearBuilt) %>%
select(-YearRemodAdd)
grid.arrange(
ggplot(house_train, aes(YearBuilt,RemodAdd)) +
geom_point(aes(color = as.factor(isRemod))),
ggplot(house_test, aes(YearBuilt, RemodAdd)) +
geom_point(aes(color = as.factor(isRemod))),
ncol = 2
)
nzv <- nearZeroVar(house_train[, -length(house_train)],
freqCut = 99/1,
uniqueCut = 5,
saveMetrics= TRUE)
nzv[nzv$nzv,]
## freqRatio percentUnique zeroVar nzv
## Street 242.3333 0.1369863 FALSE TRUE
## Utilities 1459.0000 0.1369863 FALSE TRUE
## Condition2 240.8333 0.5479452 FALSE TRUE
## RoofMatl 130.3636 0.5479452 FALSE TRUE
## LowQualFinSF 478.0000 1.6438356 FALSE TRUE
## X3SsnPorch 478.6667 1.3698630 FALSE TRUE
## PoolArea 1453.0000 0.5479452 FALSE TRUE
## PoolQC 484.3333 0.2739726 FALSE TRUE
## MiscVal 128.0000 1.4383562 FALSE TRUE
house_train[, rownames(nzv[nzv$nzv,])] <- NULL
house_test[, rownames(nzv[nzv$nzv,])] <- NULL
dim(house_train)
## [1] 1460 73
dim(house_test)
## [1] 1459 72
## use knn to impute all numerical varialbes
# preProc_numerical <- preProcess(house_train[, -length(house_train)],
# method = c("knnImpute"))
# print(preProc_numerical)
# house_train[, -length(house_train)] <- predict(preProc_numerical,
# house_train[, -length(house_train)])
# house_test <- predict(preProc_numerical, house_test)
## Caret only handles numerical features
## use kNN to impute categorical features
house_train[, -length(house_train)] <- kNN(house_train[, -length(house_train)],
k=5)[,names(house_train)[-length(house_train)]]
house_test <- kNN(house_test, k=5)[,names(house_test)]
## Double check missingness with the following code
aggr(house_train, prop = F, numbers = T)
aggr(house_test, prop = F, numbers = T)
SalePrice to log scale## Transform SalePrice to log scale
house_train$SalePrice <- log(house_train$SalePrice + 1)
save(house_train, house_test, file = "./house_preProc.RData")
rm(list = ls()) # clear workspace
library(caret)
load("./house_preProc.RData")
createDataPartiton## Perform single 80%/20% random split of house_train
library(caret)
set.seed(321)
trainIdx <- createDataPartition(house_train$SalePrice,
p = .8,
list = FALSE,
times = 1)
subTrain <- house_train[trainIdx,]
subTest <- house_train[-trainIdx,]
print(head(subTrain))
## MSSubClass MSZoning LotFrontage LotArea Alley LotShape LandContour
## 1 60 RL 65 8450 NA Reg Lvl
## 2 20 RL 80 9600 NA Reg Lvl
## 3 60 RL 68 11250 NA IR1 Lvl
## 4 70 RL 60 9550 NA IR1 Lvl
## 5 60 RL 84 14260 NA IR1 Lvl
## 6 50 RL 85 14115 NA IR1 Lvl
## LotConfig LandSlope Neighborhood Condition1 BldgType HouseStyle
## 1 Inside Gtl CollgCr Norm 1Fam 2Story
## 2 FR2 Gtl Veenker Feedr 1Fam 1Story
## 3 Inside Gtl CollgCr Norm 1Fam 2Story
## 4 Corner Gtl Crawfor Norm 1Fam 2Story
## 5 FR2 Gtl NoRidge Norm 1Fam 2Story
## 6 Inside Gtl Mitchel Norm 1Fam 1.5Fin
## OverallQual OverallCond YearBuilt RoofStyle Exterior1st Exterior2nd
## 1 7 5 2003 Gable VinylSd VinylSd
## 2 6 8 1976 Gable MetalSd MetalSd
## 3 7 5 2001 Gable VinylSd VinylSd
## 4 7 5 1915 Gable Wd Sdng Wd Shng
## 5 8 5 2000 Gable VinylSd VinylSd
## 6 5 5 1993 Gable VinylSd VinylSd
## MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond
## 1 BrkFace 196 Gd TA PConc Gd TA
## 2 None 0 TA TA CBlock Gd TA
## 3 BrkFace 162 Gd TA PConc Gd TA
## 4 None 0 TA TA BrkTil TA Gd
## 5 BrkFace 350 Gd TA PConc Gd TA
## 6 None 0 TA TA Wood Gd TA
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
## 1 No GLQ 706 Unf 0 150
## 2 Gd ALQ 978 Unf 0 284
## 3 Mn GLQ 486 Unf 0 434
## 4 No ALQ 216 Unf 0 540
## 5 Av GLQ 655 Unf 0 490
## 6 No GLQ 732 Unf 0 64
## TotalBsmtSF Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## 1 856 GasA Ex Y SBrkr 856 854
## 2 1262 GasA Ex Y SBrkr 1262 0
## 3 920 GasA Ex Y SBrkr 920 866
## 4 756 GasA Gd Y SBrkr 961 756
## 5 1145 GasA Ex Y SBrkr 1145 1053
## 6 796 GasA Ex Y SBrkr 796 566
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 1710 1 0 2 1 3
## 2 1262 0 1 2 0 3
## 3 1786 1 0 2 1 3
## 4 1717 1 0 1 0 3
## 5 2198 1 0 2 1 4
## 6 1362 1 0 1 1 1
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 8 Typ 0 NA
## 2 1 TA 6 Typ 1 TA
## 3 1 Gd 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 9 Typ 1 TA
## 6 1 TA 5 Typ 0 NA
## GarageType GarageFinish GarageCars GarageArea GarageQual GarageCond
## 1 Attchd RFn 2 548 TA TA
## 2 Attchd RFn 2 460 TA TA
## 3 Attchd RFn 2 608 TA TA
## 4 Detchd Unf 3 642 TA TA
## 5 Attchd RFn 3 836 TA TA
## 6 Attchd Unf 2 480 TA TA
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch ScreenPorch Fence
## 1 Y 0 61 0 0 NA
## 2 Y 298 0 0 0 NA
## 3 Y 0 42 0 0 NA
## 4 Y 0 35 272 0 NA
## 5 Y 192 84 0 0 NA
## 6 Y 40 30 0 0 MnPrv
## MiscFeature MoSold YrSold SaleType SaleCondition SalePrice hasGarage
## 1 NA 2 2008 WD Normal 12.24770 1
## 2 NA 5 2007 WD Normal 12.10902 1
## 3 NA 9 2008 WD Normal 12.31717 1
## 4 NA 2 2006 WD Abnorml 11.84940 1
## 5 NA 12 2008 WD Normal 12.42922 1
## 6 Shed 10 2009 WD Normal 11.87061 1
## GarageBlt isRemod RemodAdd
## 1 0 0 0
## 2 0 0 0
## 3 0 1 1
## 4 83 1 55
## 5 0 0 0
## 6 0 1 2
trainControlset.seed(456)
fitCtrl <- trainControl(method = "repeatedcv",
number = 5,
repeats = 3,
verboseIter = FALSE,
summaryFunction = defaultSummary)
First of all let’s build a multiple linear model to use all predictors to predict SalePrice:
lmFit <- train(SalePrice ~., data = subTrain,
method = "lm")
# summary(lmFit)
## Call:
## lm(formula = .outcome ~ ., data = dat)
## ... ...
## Residual standard error: 0.1152 on 915 degrees of freedom
## Multiple R-squared: 0.935, Adjusted R-squared: 0.917
## F-statistic: 52 on 253 and 915 DF, p-value: < 2.2e-16
lmImp <- varImp(lmFit, scale = FALSE)
## lm variable importance
##
## only 20 most important variables shown (out of 253)
##
## Overall
## MSZoningRL 8.151
## MSZoningFV 7.881
## MSZoningRM 7.659
## MSZoningRH 6.977
## SaleConditionAbnorml 5.433
## OverallQual1 5.309
## LandContourBnk 4.292
Note: linear models use the absolute value of the t-statistic.
plot(lmImp, top = 20)
mean(lmFit$resample$RMSE)
## [1] 0.1738542
predicted <- predict(lmFit, subTest)
RMSE(pred = predicted, obs = subTest$SalePrice)
## [1] 0.1629591
ggplot(subTest, aes(x = exp(SalePrice)-1, y = exp(predicted)-1)) +
geom_point() +
coord_fixed()
trainenetGrid <- expand.grid(alpha = seq(0, 1, .1),
lambda = seq(0, .6, .01))
set.seed(1234) # for reproducibility
enetFit <- train(SalePrice ~ .,
data = subTrain,
method="glmnet",
metric="RMSE",
trControl=fitCtrl,
tuneGrid=enetGrid)
print(enetFit$bestTune)
## alpha lambda
## 18 0 0.17
plot(enetFit)
plot(enetFit, plotType = "level")
enetVarImp <- varImp(enetFit, scale = FALSE)
plot(enetVarImp, top = 20)
mean(enetFit$resample$RMSE)
## [1] 0.1487311
mean(enetFit$resample$RMSE)
## [1] 0.1487311
predicted <- predict(enetFit, subTest)
RMSE(pred = predicted, obs = subTest$SalePrice)
## [1] 0.1368344
subTest$predicted <- predict(enetFit, subTest)
ggplot(subTest, aes(x = SalePrice, y = predicted)) + geom_point()
fitCtrl <- trainControl(method = "cv",
number = 5,
verboseIter = TRUE,
summaryFunction=defaultSummary)
gbmGrid <- expand.grid( n.trees = seq(100,1000,100),
interaction.depth = seq(1,10,2),
shrinkage = c(0.1),
n.minobsinnode = 10)
gbmFit <- train(SalePrice ~ .,
data = subTrain,
method = "gbm",
trControl = fitCtrl,
tuneGrid=gbmGrid,
metric='RMSE',
maximize=FALSE)
plot(gbmFit)
gbmFit$bestTune
## n.trees interaction.depth shrinkage n.minobsinnode
## 16 600 3 0.1 10
mean(gbmFit$resample$RMSE)
## [1] 0.1372074
predicted <- predict(gbmFit, subTest)
RMSE(pred = predicted, obs = subTest$SalePrice)
## [1] 0.1279916
ggplot(subTest, aes(x = exp(SalePrice)-1, y = exp(predicted)-1)) +
geom_point() +
coord_fixed()
| Model | RMSE |
|---|---|
| Linear Regression (all variables) | 0.1630 |
| Linear Regression with Regularization (Lasso) | 0.1368 |
| Gradient Boosting Machines | 0.1280 |